/*
 * Decompiled with CFR 0.152.
 */
package net.nooj4nlp.engine;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.html.HTMLEditorKit;
import net.nooj4nlp.engine.DocxToText;
import net.nooj4nlp.engine.helper.ParameterCheck;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * Static helpers that read a file in one of several formats (raw text, RTF, Word, HTML, DOCX,
 * PDF) and return its textual content as a {@link String}.
 *
 * <p>The main entry point is {@link #loadText(String, int, String, String, ArrayList)}, which
 * dispatches on a numeric format code and then normalizes the loaded text.
 */
public class TextIO {
    private static final String UTF_8_ENCODING = "UTF-8";

    /**
     * Matches a {@code charset=NAME} or {@code encoding=NAME} declaration, with or without a quote
     * before the name (e.g. {@code charset=utf-8} and {@code charset="utf-8"}). Group 1 is NAME.
     */
    private static final Pattern CHARSET_DECLARATION =
            Pattern.compile("(?:charset=|encoding=)[\"']?([-0-9A-Za-z_]+)");

    /** Matches a hexadecimal numeric character reference such as {@code &#x1F;}. */
    private static final Pattern HEX_ENTITY = Pattern.compile("&#[xX][0-9A-Fa-f]+;");

    /**
     * Reads the whole file as UTF-8 text.
     *
     * @param filePath path of the file to read
     * @return the file content
     * @throws IOException if the path does not exist, is not a regular file, or cannot be read
     */
    private static String loadRawText(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        File file = new File(filePath);
        if (!file.exists()) {
            throw new IOException("Source file on path '" + filePath + "' does not exist.");
        }
        if (!file.isFile()) {
            // Checked inline (not via requireRegularFile) so this message keeps its exact text.
            throw new IOException("Source file on path '" + filePath + "' exists but is not a file.");
        }
        return FileUtils.readFileToString(file, UTF_8_ENCODING);
    }

    /**
     * Reads the whole file as text using the given encoding name.
     *
     * @param filePath path of the file to read
     * @param encoding encoding name handed to {@code FileUtils.readFileToString}
     * @return the file content
     * @throws IOException if the path does not exist, is not a regular file, or cannot be read
     */
    private static String loadRawText(String filePath, String encoding) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        File file = requireRegularFile(filePath);
        return FileUtils.readFileToString(file, encoding);
    }

    /**
     * Resolves {@code filePath} to a {@link File} and verifies that it is an existing regular file.
     *
     * @param filePath path to check
     * @return the corresponding {@link File}
     * @throws IOException if the path does not exist or is not a regular file
     */
    private static File requireRegularFile(String filePath) throws IOException {
        File file = new File(filePath);
        if (!file.exists()) {
            throw new IOException("Source file on path '" + filePath + "' does not exist.");
        }
        if (!file.isFile()) {
            throw new IOException("Source file on path '" + filePath + "' exists but is not a file");
        }
        return file;
    }

    /**
     * Normalizes loaded text: removes every carriage return and then applies the literal
     * replacements listed in {@code charTable}.
     *
     * <p>{@code charTable} is read as consecutive (pattern, replacement) pairs; pairs are applied
     * in order, each one to the result of the previous. A trailing unpaired element, a pair with a
     * {@code null} or empty pattern, and a pair with a {@code null} replacement are skipped (an
     * empty pattern would otherwise never terminate).
     *
     * @param buffer text to normalize
     * @param charTable flat list of pattern/replacement pairs, or {@code null} for no replacements
     * @return the normalized text
     */
    private static String standardizeText(String buffer, ArrayList<String> charTable) {
        ParameterCheck.mandatoryString("buffer", buffer);
        String result = buffer.replace("\r", "");
        if (charTable == null) {
            return result;
        }
        // "i + 1 < size" ignores a dangling last element instead of indexing past the end.
        for (int i = 0; i + 1 < charTable.size(); i += 2) {
            String pattern = charTable.get(i);
            String replacement = charTable.get(i + 1);
            if (pattern == null || pattern.isEmpty() || replacement == null) {
                continue;
            }
            // Literal (non-regex) replacement of every occurrence, left to right.
            result = result.replace(pattern, replacement);
        }
        return result;
    }

    /**
     * Extracts the text of a Word file via {@link WordExtractor}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or read
     */
    private static String loadWordFile(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        InputStream input = new FileInputStream(filePath);
        try {
            return new WordExtractor(input).getText();
        } finally {
            input.close();
        }
    }

    /**
     * Loads an HTML file and reduces it to plain text.
     *
     * <p>The file is first read as UTF-8. If its content declares a different, supported charset
     * ({@code charset=...} or {@code encoding=...}), it is read again with that charset. Markup is
     * then stripped with a series of regular expressions and character entities are unescaped.
     * If that processing fails with a runtime exception, the best text obtained so far is returned
     * (the raw source if stripping itself failed).
     *
     * @param filePath path of the file to read
     * @return the plain-text rendering of the file
     * @throws IOException if the path does not exist, is not a regular file, or cannot be read
     */
    private static String loadHtmlText(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        File sourceFile = requireRegularFile(filePath);
        String source = FileUtils.readFileToString(sourceFile, UTF_8_ENCODING);

        Matcher declaration = CHARSET_DECLARATION.matcher(source);
        if (declaration.find()) {
            String encoding = declaration.group(1);
            // Only re-read when the declared charset differs and the JVM can actually decode it;
            // otherwise keep the UTF-8 text rather than failing on a bogus declaration.
            if (!encoding.equalsIgnoreCase(UTF_8_ENCODING) && isSupportedCharset(encoding)) {
                source = FileUtils.readFileToString(sourceFile, encoding);
            }
        }

        String text = source;
        try {
            text = stripHtmlMarkup(source);
            return StringEscapeUtils.unescapeHtml4(convertHexEntitiesToDecimal(text));
        } catch (RuntimeException e) {
            // Best effort: fall back to whatever was produced before the failure.
            return text;
        }
    }

    /**
     * Tells whether {@code name} is a charset name this JVM supports.
     *
     * @param name candidate charset name
     * @return {@code true} if supported; {@code false} if unsupported or not a legal charset name
     */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException ignored) {
            // Thrown for syntactically illegal names (e.g. starting with '-'); treat as unsupported.
            return false;
        }
    }

    /**
     * Removes HTML markup from {@code source} using an ordered series of regex replacements.
     * The order matters: later steps rely on the normalization done by earlier ones.
     *
     * @param source HTML text
     * @return text with tags removed and some block-level tags turned into tabs/newlines
     */
    private static String stripHtmlMarkup(String source) {
        // Collapse all whitespace runs so the document becomes a single line.
        String result = source.replaceAll("[\r\n\t ]+", " ");
        result = result.replaceAll("\\(\"\\[>\\]\"\\)", "");
        // Normalize <head ...> / </head> and drop everything between them.
        result = result.replaceAll("(?i)<( )*head([^>])*>", "<head>");
        result = result.replaceAll("(?i)(<( )*(/)( )*head( )*>)", "</head>");
        result = result.replaceAll("(?i)(<head>).*(</head>)", "");
        // Remove script tags; noscript tags are rewritten to script tags.
        result = result.replaceAll("(?i)<( )*script([^>])*>", "");
        result = result.replaceAll("(?i)(<( )*(/)( )*script( )*>)", "");
        result = result.replaceAll("(?i)<( )*noscript([^>])*>", "<script>");
        result = result.replaceAll("(?i)(<( )*(/)( )*noscript( )*>)", "</script>");
        // Remove comments that contain no '-' character.
        result = result.replaceAll("(?i)<!--( )*([^-])*( )*-->", "");
        // Normalize <style ...> / </style> and drop everything between them.
        result = result.replaceAll("(?i)<( )*style([^>])*>", "<style>");
        result = result.replaceAll("(?i)(<( )*(/)( )*style( )*>)", "</style>");
        result = result.replaceAll("(?i)(<style>).*(</style>)", "");
        // Turn selected structural tags into tabs and line breaks.
        result = result.replaceAll("(?i)<( )*td([^>])*>", "\t");
        result = result.replaceAll("(?i)<( )*br( )*/?()*>", "\n");
        result = result.replaceAll("(?i)<( )*li( )*>", "\n");
        result = result.replaceAll("(?i)<( )*div([^>])*>", "\n\n");
        result = result.replaceAll("(?i)<( )*tr([^>])*>", "\n\n");
        result = result.replaceAll("(?i)<( )*p([^>])*>", "\n\n");
        // Drop every remaining tag.
        result = result.replaceAll("(?i)<[^>]*>", "");
        // Tidy up spaces and tabs caught between the inserted tabs/newlines.
        result = result.replaceAll("(?i)(\n)( )+(\n)", "\n\n");
        result = result.replaceAll("(?i)(\t)( )+(\t)", "\t\t");
        result = result.replaceAll("(?i)(\t)( )+(\n)", "\t\n");
        result = result.replaceAll("(?i)(\n)( )+(\t)", "\n\t");
        result = result.replaceAll("(?i)(\n)(\t)+(\n)", "\n\n");
        result = result.replaceAll("(?i)(\n)(\t)+", "\n\t");
        return result;
    }

    /**
     * Rewrites hexadecimal character references ({@code &#x41;}) as decimal ones ({@code &#65;}).
     * A reference whose value cannot be parsed as an {@code int} is left unchanged.
     *
     * @param text text possibly containing hexadecimal character references
     * @return the text with parseable hexadecimal references converted to decimal
     */
    private static String convertHexEntitiesToDecimal(String text) {
        Matcher matcher = HEX_ENTITY.matcher(text);
        // Matcher.appendReplacement accepts only StringBuffer on older Java versions.
        StringBuffer converted = new StringBuffer();
        while (matcher.find()) {
            String entity = matcher.group();
            String replacement;
            try {
                replacement = TextIO.hexToDecimal(entity);
            } catch (NumberFormatException ignored) {
                // Value does not fit in an int: keep the original entity text.
                replacement = entity;
            }
            matcher.appendReplacement(converted, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(converted);
        return converted.toString();
    }

    /**
     * Converts one hexadecimal character reference to its decimal form.
     *
     * @param m a reference of the form {@code &#xH...;} (three-character prefix, trailing ';')
     * @return the same reference written as {@code &#D...;}
     * @throws NumberFormatException if the hexadecimal digits do not form a valid {@code int}
     */
    private static String hexToDecimal(String m) {
        return "&#" + Integer.parseInt(m.substring(3, m.length() - 1), 16) + ';';
    }

    /**
     * Extracts the text of a file using {@link DocxToText}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if extraction fails
     */
    private static String loadWord2007File(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        DocxToText extractor = new DocxToText(filePath);
        return extractor.extractText();
    }

    /**
     * Reads a file through a Swing editor kit and returns the text of the resulting document.
     *
     * <p>NOTE(review): despite the method name, the file is parsed with {@link HTMLEditorKit},
     * not an RTF-specific kit. This is kept as found; confirm whether it is intentional.
     *
     * @param filePath path of the file to read
     * @return the full text of the parsed document
     * @throws IOException if the path does not exist, is not a regular file, or cannot be read
     * @throws BadLocationException if the document text cannot be retrieved
     */
    public static String loadRtfFile(String filePath) throws IOException, BadLocationException {
        ParameterCheck.mandatoryString("filePath", filePath);
        File file = requireRegularFile(filePath);
        HTMLEditorKit editorKit = new HTMLEditorKit();
        Document document = editorKit.createDefaultDocument();
        FileInputStream input = new FileInputStream(file);
        try {
            editorKit.read(input, document, 0);
            return document.getText(0, document.getLength());
        } finally {
            input.close();
        }
    }

    /**
     * Extracts the text of every page of a PDF file and concatenates it, in page order.
     *
     * @param filePath path of the PDF file
     * @return the concatenated page texts
     * @throws IOException if the file cannot be opened or parsed
     */
    private static String loadPdfFile(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        StringBuilder text = new StringBuilder();
        for (String pageText : extractPdfPages(filePath)) {
            text.append(pageText);
        }
        return text.toString();
    }

    /**
     * Extracts the text of every page of a PDF file.
     *
     * @param filePath path of the PDF file
     * @return one string per page, in page order
     * @throws IOException if the file cannot be opened or parsed
     */
    public static List<String> loadPdfFileToStrings(String filePath) throws IOException {
        ParameterCheck.mandatoryString("filePath", filePath);
        return extractPdfPages(filePath);
    }

    /**
     * Opens the PDF, extracts each page with a {@link SimpleTextExtractionStrategy}, and closes
     * the reader exactly once afterwards (also when extraction fails or there are no pages).
     *
     * @param filePath path of the PDF file
     * @return one string per page, in page order
     * @throws IOException if the file cannot be opened or parsed
     */
    private static List<String> extractPdfPages(String filePath) throws IOException {
        PdfReader reader = new PdfReader(filePath);
        try {
            int pageCount = reader.getNumberOfPages();
            ArrayList<String> pages = new ArrayList<String>();
            for (int page = 1; page <= pageCount; ++page) {
                TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                pages.add(PdfTextExtractor.getTextFromPage(reader, page, strategy));
            }
            return pages;
        } finally {
            reader.close();
        }
    }

    /**
     * Loads a file according to a numeric format code and normalizes the resulting text.
     *
     * <p>The absolute value of {@code encodingtype} selects the loader:
     * 1 raw UTF-8 text, 2 raw text in {@code encodingcode}, 3 {@link #loadRtfFile(String)},
     * 4 Word ({@link WordExtractor}), 5 HTML, 6 {@link DocxToText}, 7 PDF.
     *
     * @param filePath path of the file to load
     * @param encodingtype format code; its sign is ignored
     * @param encodingcode encoding name, used only for format code 2
     * @param encodingname not used by this method
     * @param chartable flat list of pattern/replacement pairs applied to the loaded text, or
     *     {@code null}
     * @return the normalized text, or {@code null} if the format code is not one of 1 to 7
     * @throws IOException if the file cannot be read
     * @throws BadLocationException propagated from {@link #loadRtfFile(String)}
     */
    public static String loadText(String filePath, int encodingtype, String encodingcode, String encodingname, ArrayList<String> chartable) throws IOException, BadLocationException {
        String buf;
        switch (Math.abs(encodingtype)) {
            case 1: {
                buf = TextIO.loadRawText(filePath);
                break;
            }
            case 2: {
                buf = TextIO.loadRawText(filePath, encodingcode);
                break;
            }
            case 3: {
                buf = TextIO.loadRtfFile(filePath);
                break;
            }
            case 4: {
                buf = TextIO.loadWordFile(filePath);
                break;
            }
            case 5: {
                buf = TextIO.loadHtmlText(filePath);
                break;
            }
            case 6: {
                buf = TextIO.loadWord2007File(filePath);
                break;
            }
            case 7: {
                buf = TextIO.loadPdfFile(filePath);
                break;
            }
            default: {
                return null;
            }
        }
        return TextIO.standardizeText(buf, chartable);
    }
}

